August 2019
Dataset: 10k PBMCs from a Healthy Donor available from 10x Genomics (here).
This notebook uses a loom file generated from the first part of the SCENIC protocol, described in: PBMC10k_SCENIC-protocol-CLI.ipynb
# import dependencies
import os
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp
from MulticoreTSNE import MulticoreTSNE as TSNE
import json
import base64
import zlib
from pyscenic.plotting import plot_binarization
from pyscenic.export import add_scenic_metadata
from pyscenic.cli.utils import load_signatures
import matplotlib as mpl
import matplotlib.pyplot as plt
from scanpy.plotting._tools.scatterplots import plot_scatter
import seaborn as sns
# set a working directory
wdir = "/mnt/beegfs/data/users/vsc32528/pbmc10kprotocol/"
wdir = '/ddn1/vol1/staging/leuven/stg_00002/lcb/cflerin/testruns/scenic-protocol/prottest4'
os.chdir( wdir )
# path to loom output, generated from a combination of Scanpy and pySCENIC results:
f_final_loom = 'PBMC10k.loom'
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=150)
# scenic output
lf = lp.connect( f_final_loom, mode='r', validate=False )
meta = json.loads(zlib.decompress(base64.b64decode( lf.attrs.MetaData )))
exprMat = pd.DataFrame( lf[:,:], index=lf.ra.Gene, columns=lf.ca.CellID).T
auc_mtx = pd.DataFrame( lf.ca.RegulonsAUC, index=lf.ca.CellID)
# create a dictionary of regulons:
regulons = {}
for i,r in pd.DataFrame(lf.ra.Regulons,index=lf.ra.Gene).iteritems():
regulons[i] = list(r[r==1].index.values)
# cell annotations from the loom column attributes:
cellAnnot = pd.concat(
[
pd.DataFrame( lf.ca.Celltype_Garnett, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.ClusterID, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.Louvain_clusters_Scanpy, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.Percent_mito, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.nGene, index=lf.ca.CellID ),
pd.DataFrame( lf.ca.nUMI, index=lf.ca.CellID ),
],
axis=1
)
cellAnnot.columns = [
'Celltype_Garnett',
'ClusterID',
'Louvain_clusters_Scanpy',
'Percent_mito',
'nGene',
'nUMI']
# capture embeddings:
dr = [
pd.DataFrame( lf.ca.Embedding, index=lf.ca.CellID )
]
dr_names = [
meta['embeddings'][0]['name'].replace(" ","_")
]
# add other embeddings
drx = pd.DataFrame( lf.ca.Embeddings_X, index=lf.ca.CellID )
dry = pd.DataFrame( lf.ca.Embeddings_Y, index=lf.ca.CellID )
for i in range( len(drx.columns) ):
dr.append( pd.concat( [ drx.iloc[:,i], dry.iloc[:,i] ], sort=False, axis=1, join='outer' ))
dr_names.append( meta['embeddings'][i+1]['name'].replace(" ","_").replace('/','-') )
# rename columns:
for i,x in enumerate( dr ):
x.columns = ['X','Y']
lf.close()
scanpy.AnnData object¶This can be done directly from the integrated loom file, with a few modifications to allow for SCENIC- and SCope-specific loom attributes:
adata = sc.read( f_final_loom, validate=False)
# drop the embeddings and extra attributes from the obs object
adata.obs.drop( ['Embedding','Embeddings_X','Embeddings_Y','RegulonsAUC'], axis=1, inplace=True )
# add the embeddings into the adata.obsm object
for i,x in enumerate( dr ):
adata.obsm[ 'X_'+dr_names[i] ] = x.as_matrix()
sc.utils.sanitize_anndata( adata )
scanpy.AnnData object.¶# load the regulons from a file using the load_signatures function
sig = load_signatures('reg.csv')
add_scenic_metadata(adata, auc_mtx, sig)
# helper functions (not yet integrated into pySCENIC):
from pyscenic.utils import load_motifs
import operator as op
from IPython.display import HTML, display
BASE_URL = "http://motifcollections.aertslab.org/v9/logos/"
COLUMN_NAME_LOGO = "MotifLogo"
COLUMN_NAME_MOTIF_ID = "MotifID"
COLUMN_NAME_TARGETS = "TargetGenes"
def display_logos(df: pd.DataFrame, top_target_genes: int = 3, base_url: str = BASE_URL):
"""
:param df:
:param base_url:
"""
# Make sure the original dataframe is not altered.
df = df.copy()
# Add column with URLs to sequence logo.
def create_url(motif_id):
return '<img src="{}{}.png" style="max-height:124px;"></img>'.format(base_url, motif_id)
df[("Enrichment", COLUMN_NAME_LOGO)] = list(map(create_url, df.index.get_level_values(COLUMN_NAME_MOTIF_ID)))
# Truncate TargetGenes.
def truncate(col_val):
return sorted(col_val, key=op.itemgetter(1))[:top_target_genes]
df[("Enrichment", COLUMN_NAME_TARGETS)] = list(map(truncate, df[("Enrichment", COLUMN_NAME_TARGETS)]))
MAX_COL_WIDTH = pd.get_option('display.max_colwidth')
pd.set_option('display.max_colwidth', 200)
display(HTML(df.head().to_html(escape=False)))
pd.set_option('display.max_colwidth', MAX_COL_WIDTH)
df_motifs = load_motifs('reg.csv')
selected_motifs = ['PAX5','TCF3','EBF1']
df_motifs_sel = df_motifs.iloc[ [ True if x in selected_motifs else False for x in df_motifs.index.get_level_values('TF') ] ,:]
#display_logos(df_motifs.head())
display_logos( df_motifs_sel.sort_values([('Enrichment','NES')], ascending=False).head(9))
sc.set_figure_params(frameon=False, dpi=150, fontsize=10)
sc.pl.scatter( adata, basis='highly_variable_genes_UMAP',
color=['Louvain_clusters_Scanpy','Celltype_Garnett'],
title=['HVG - UMAP (Louvain clusters)','HVG - UMAP (Cell type)'],
alpha=0.8,
save='_Louvain-celltype.svg'
)
sc.pl.scatter( adata, basis='SCENIC_AUC_UMAP',
color=['Louvain_clusters_Scanpy','Celltype_Garnett'],
title=['SCENIC - UMAP (Louvain clusters)','SCENIC - UMAP (Cell type)'],
alpha=0.8,
save='_Louvain-celltype.svg'
)
(this uses non-Scanpy plotting functions)
def colorMap( x, palette='bright' ):
import natsort
from collections import OrderedDict
#
n=len(set(x))
cpalette = sns.color_palette(palette,n_colors=n )
cdict = dict( zip( list(set(x)), cpalette ))
cmap = [ cdict[i] for i in x ]
cdict = OrderedDict( natsort.natsorted(cdict.items()) )
return cmap,cdict
def drplot( dr, colorlab, ax, palette='bright', title=None, **kwargs ):
cmap,cdict = colorMap( colorlab, palette )
for lab,col in cdict.items():
ix = colorlab.loc[colorlab==lab].index
ax.scatter( dr['X'][ix], dr['Y'][ix], c=[col]*len(ix), alpha=0.7, label=lab, edgecolors='none')
if( title is not None ):
ax.set_title(title, fontsize='x-large');
#
ax.set_xticks([])
ax.set_yticks([])
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
plt.rcParams.update({'font.size':12})
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(20,10), dpi=150 )
drplot( dr[0], colorlab=cellAnnot['Louvain_clusters_Scanpy'], ax=ax1, palette='bright', s=2, title='Highly variable genes - UMAP' )
drplot( dr[4], colorlab=cellAnnot['Louvain_clusters_Scanpy'], ax=ax2, palette='bright', s=2, title='SCENIC AUC - UMAP' )
ax2.legend(loc='right', bbox_to_anchor=(1.15, 0.5), ncol=1, markerscale=2, fontsize='x-large', frameon=False, title="Louvain\nclusters")
plt.tight_layout()
plt.savefig("PBMC10k_dimred_umap-hvg-scenic-louvain.png", dpi=150, bbox_inches = "tight")
from pyscenic.rss import regulon_specificity_scores
from pyscenic.plotting import plot_rss
import matplotlib.pyplot as plt
from adjustText import adjust_text
import seaborn as sns
from pyscenic.binarization import binarize
rss_cellType = regulon_specificity_scores( auc_mtx, cellAnnot['Celltype_Garnett'] )
rss_cellType
cats = sorted(list(set(cellAnnot['Celltype_Garnett'])))
fig = plt.figure(figsize=(15, 8))
for c,num in zip(cats, range(1,len(cats)+1)):
x=rss_cellType.T[c]
ax = fig.add_subplot(2,5,num)
plot_rss(rss_cellType, c, top_n=5, max_n=None, ax=ax)
ax.set_ylim( x.min()-(x.max()-x.min())*0.05 , x.max()+(x.max()-x.min())*0.05 )
for t in ax.texts:
t.set_fontsize(12)
ax.set_ylabel('')
ax.set_xlabel('')
adjust_text(ax.texts, autoalign='xy', ha='right', arrowprops=dict(arrowstyle='-',color='lightgrey'), precision=0.001 )
fig.text(0.5, 0.0, 'Regulon', ha='center', va='center', size='x-large')
fig.text(0.00, 0.5, 'Regulon specificity score (RSS)', ha='center', va='center', rotation='vertical', size='x-large')
plt.tight_layout()
plt.rcParams.update({
'figure.autolayout': True,
'figure.titlesize': 'large' ,
#'axes.labelsize': 'x-large',
#'axes.titlesize':'x-large',
'xtick.labelsize':'large',
'ytick.labelsize':'large'
})
plt.savefig("PBMC10k_cellType-RSS-top5.png", dpi=150, bbox_inches = "tight")
plt.show()
topreg = []
for i,c in enumerate(cats):
topreg.extend(
list(rss_cellType.T[c].sort_values(ascending=False)[:5].index)
)
topreg = list(set(topreg))
auc_mtx_Z = pd.DataFrame( index=auc_mtx.index )
for col in list(auc_mtx.columns):
auc_mtx_Z[ col ] = ( auc_mtx[col] - auc_mtx[col].mean()) / auc_mtx[col].std(ddof=0)
#auc_mtx_Z.sort_index(inplace=True)
def palplot(pal, names, colors=None, size=1):
n = len(pal)
f, ax = plt.subplots(1, 1, figsize=(n * size, size))
ax.imshow(np.arange(n).reshape(1, n),
cmap=mpl.colors.ListedColormap(list(pal)),
interpolation="nearest", aspect="auto")
ax.set_xticks(np.arange(n) - .5)
ax.set_yticks([-.5, .5])
ax.set_xticklabels([])
ax.set_yticklabels([])
colors = n * ['k'] if colors is None else colors
for idx, (name, color) in enumerate(zip(names, colors)):
ax.text(0.0+idx, 0.0, name, color=color, horizontalalignment='center', verticalalignment='center')
return f
colors = sns.color_palette('bright',n_colors=len(cats) )
colorsd = dict( zip( cats, colors ))
colormap = [ colorsd[x] for x in cellAnnot['Celltype_Garnett'] ]
sns.set()
sns.set(font_scale=0.8)
fig = palplot( colors, cats, size=1.0)
g = sns.clustermap(auc_mtx_Z[topreg], annot=False, square=False, linecolor='gray',
yticklabels=False, vmin=-2, vmax=6, row_colors=colormap,
cmap="YlGnBu", figsize=(21,16) )
sns.set(font_scale=0.9)
g.cax.set_visible(True)
g.ax_heatmap.set_ylabel('')
g.ax_heatmap.set_xlabel('')
binary_mtx, auc_thresholds = binarize( auc_mtx, num_workers=25 )
binary_mtx.head()
# select regulons:
r = [ 'RXRA_(+)', 'NFE2_(+)', 'ETV2_(+)' ]
fig, axs = plt.subplots(1, 3, figsize=(16, 5), dpi=150, sharey=False)
for i,ax in enumerate(axs):
sns.distplot(auc_mtx[ r[i] ], ax=ax, norm_hist=True, bins=100)
ax.plot( [ auc_thresholds[ r[i] ] ]*2, ax.get_ylim(), 'r:')
ax.title.set_text( r[i] )
ax.set_xlabel('')
fig.text(0.00, 0.5, 'Frequency', ha='center', va='center', rotation='vertical', size='x-large')
fig.text(0.5, 0.0, 'AUC', ha='center', va='center', rotation='horizontal', size='x-large')
fig.tight_layout()
fig.savefig("PBMC10k_cellType-binaryPlot2.png", dpi=150, bbox_inches = "tight")
rss_louvain = regulon_specificity_scores( auc_mtx, cellAnnot['Louvain_clusters_Scanpy'] )
rss_louvain
cats = sorted( list(set(cellAnnot['Louvain_clusters_Scanpy'])), key=int )
fig = plt.figure(figsize=(15, 12))
for c,num in zip(cats, range(1,len(cats)+1)):
x=rss_louvain.T[c]
ax = fig.add_subplot(3,5,num)
plot_rss(rss_louvain, c, top_n=5, max_n=None, ax=ax)
ax.set_ylim( x.min()-(x.max()-x.min())*0.05 , x.max()+(x.max()-x.min())*0.05 )
for t in ax.texts:
t.set_fontsize(12)
ax.set_ylabel('')
ax.set_xlabel('')
adjust_text(ax.texts, autoalign='xy', ha='right', arrowprops=dict(arrowstyle='-',color='lightgrey'), precision=0.001 )
fig.text(0.5, 0.0, 'Regulon', ha='center', va='center', size='x-large')
fig.text(0.00, 0.5, 'Regulon specificity score (RSS)', ha='center', va='center', rotation='vertical', size='x-large')
plt.tight_layout()
plt.rcParams.update({
'figure.autolayout': True,
'figure.titlesize': 'large' ,
'xtick.labelsize':'large',
'ytick.labelsize':'large'
})
plt.savefig("PBMC10k_Louvain-RSS-top5.png", dpi=150, bbox_inches = "tight")
plt.show()
topreg = []
for i,c in enumerate(cats):
topreg.extend(
list(rss_louvain.T[c].sort_values(ascending=False)[:5].index)
)
topreg = list(set(topreg))
def palplot(pal, names, colors=None, size=1):
n = len(pal)
f, ax = plt.subplots(1, 1, figsize=(n * size, size))
ax.imshow(np.arange(n).reshape(1, n),
cmap=mpl.colors.ListedColormap(list(pal)),
interpolation="nearest", aspect="auto")
ax.set_xticks(np.arange(n) - .5)
ax.set_yticks([-.5, .5])
ax.set_xticklabels([])
ax.set_yticklabels([])
colors = n * ['k'] if colors is None else colors
for idx, (name, color) in enumerate(zip(names, colors)):
ax.text(0.0+idx, 0.0, name, color=color, horizontalalignment='center', verticalalignment='center')
return f
colors = sns.color_palette('bright',n_colors=len(cats) )
colorsd = dict( zip( cats, colors ))
colormap = [ colorsd[x] for x in cellAnnot['Louvain_clusters_Scanpy'] ]
sns.set()
sns.set(font_scale=0.8)
fig = palplot( colors, cats, size=1.0)
g = sns.clustermap(auc_mtx_Z[topreg], annot=False, square=False, linecolor='gray',
yticklabels=False, vmin=-2, vmax=6, row_colors=colormap,
cmap="YlGnBu", figsize=(21,16) )
sns.set(font_scale=0.9)
g.cax.set_visible(True)
g.ax_heatmap.set_ylabel('')
g.ax_heatmap.set_xlabel('')
adjacencies = pd.read_csv("adj.tsv", index_col=False, sep='\t')
Create the modules:
from pyscenic.utils import modules_from_adjacencies
modules = list(modules_from_adjacencies(adjacencies, exprMat))
tf = 'EBF1'
tf_mods = [ x for x in modules if x.transcription_factor==tf ]
for i,mod in enumerate( tf_mods ):
print( f'{tf} module {str(i)}: {len(mod.genes)} genes' )
print( f'{tf} regulon: {len(regulons[tf+"_(+)"])} genes' )
write these modules, and the regulon to files:
for i,mod in enumerate( tf_mods ):
with open( tf+'_module_'+str(i)+'.txt', 'w') as f:
for item in mod.genes:
f.write("%s\n" % item)
with open( tf+'_regulon.txt', 'w') as f:
for item in regulons[tf+'_(+)']:
f.write("%s\n" % item)
from IPython.display import display, Image
display(Image(filename='iRegulon_screenshot_PBMC10k-EBF1.png'))